suppressPackageStartupMessages ({
library (SingleCellExperiment)
library (scran)
library (scater)
library (batchelor)
library (scDblFinder)
library (sctransform)
library (muscat)
library (SEtools)
library (cowplot)
library (BiocParallel)
library (BiocNeighbors)
library (ComplexHeatmap)
library (igraph)
library (sechm)
})
Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
TMB was built with Matrix version 1.4.1
Current Matrix version is 1.5.3
Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
Preprocessing and Clustering
Workflow inspired by: https://bioconductor.org/books/release/OSCA/ and the materials from the course STA426 - UZH.
Loading the data
sce <- readRDS ("week13.SCE.rds" )
table (sce$ sample_id, sce$ group_id)
WT LPS
LC016_WT 1730 0
LC017_LPS 0 1027
LC019_WT 1064 0
LC020_LPS 0 1468
LC022_WT 1806 0
LC023_LPS 0 1546
LC025_WT 1259 0
LC026_LPS 0 2247
class: SingleCellExperiment
dim: 27998 12147
metadata(0):
assays(1): counts
rownames(27998): ENSMUSG00000051951.Xkr4 ENSMUSG00000089699.Gm1992 ...
ENSMUSG00000096730.Vmn2r122 ENSMUSG00000095742.CAAA01147332.1
rowData names(2): ENSEMBL SYMBOL
colnames(12147): AAACCTGCAAAGCGGT-1.LC017 AAACCTGCACGGACAA-1.LC017 ...
TTTGTCACATCTATGG-1.LC025 TTTGTCAGTCATATGC-1.LC025
colData names(3): sample_id barcode group_id
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
# counts assay
counts (sce)[41 : 45 ,45 : 50 ]
5 x 6 sparse Matrix of class "dgCMatrix"
AAGGAGCAGGAATTAC-1.LC017
ENSMUSG00000042501.Cpa6 .
ENSMUSG00000048960.Prex2 2
ENSMUSG00000057715.A830018L16Rik .
ENSMUSG00000097171.Gm17644 .
ENSMUSG00000101314.Gm29663 .
AAGGAGCGTAGGAGTC-1.LC017
ENSMUSG00000042501.Cpa6 .
ENSMUSG00000048960.Prex2 .
ENSMUSG00000057715.A830018L16Rik 2
ENSMUSG00000097171.Gm17644 .
ENSMUSG00000101314.Gm29663 .
AAGGCAGTCTCTGAGA-1.LC017
ENSMUSG00000042501.Cpa6 .
ENSMUSG00000048960.Prex2 .
ENSMUSG00000057715.A830018L16Rik 1
ENSMUSG00000097171.Gm17644 .
ENSMUSG00000101314.Gm29663 .
AAGGTTCAGCTAGTTC-1.LC017
ENSMUSG00000042501.Cpa6 .
ENSMUSG00000048960.Prex2 .
ENSMUSG00000057715.A830018L16Rik 2
ENSMUSG00000097171.Gm17644 .
ENSMUSG00000101314.Gm29663 .
AAGGTTCAGCTCTCGG-1.LC017
ENSMUSG00000042501.Cpa6 1
ENSMUSG00000048960.Prex2 .
ENSMUSG00000057715.A830018L16Rik 2
ENSMUSG00000097171.Gm17644 .
ENSMUSG00000101314.Gm29663 .
AAGTCTGGTGTTTGTG-1.LC017
ENSMUSG00000042501.Cpa6 1
ENSMUSG00000048960.Prex2 .
ENSMUSG00000057715.A830018L16Rik 2
ENSMUSG00000097171.Gm17644 .
ENSMUSG00000101314.Gm29663 .
# cells names and metadata
head (colnames (sce))
[1] "AAACCTGCAAAGCGGT-1.LC017" "AAACCTGCACGGACAA-1.LC017"
[3] "AAACCTGTCGCAAACT-1.LC017" "AAACGGGGTATTCGTG-1.LC017"
[5] "AAACGGGGTCGCGGTT-1.LC017" "AAACGGGGTGGTTTCA-1.LC017"
DataFrame with 6 rows and 3 columns
sample_id barcode group_id
<factor> <character> <factor>
AAACCTGCAAAGCGGT-1.LC017 LC017_LPS AAACCTGCAAAGCGGT-1 LPS
AAACCTGCACGGACAA-1.LC017 LC017_LPS AAACCTGCACGGACAA-1 LPS
AAACCTGTCGCAAACT-1.LC017 LC017_LPS AAACCTGTCGCAAACT-1 LPS
AAACGGGGTATTCGTG-1.LC017 LC017_LPS AAACGGGGTATTCGTG-1 LPS
AAACGGGGTCGCGGTT-1.LC017 LC017_LPS AAACGGGGTCGCGGTT-1 LPS
AAACGGGGTGGTTTCA-1.LC017 LC017_LPS AAACGGGGTGGTTTCA-1 LPS
# genes names and metadata
head (rownames (sce))
[1] "ENSMUSG00000051951.Xkr4" "ENSMUSG00000089699.Gm1992"
[3] "ENSMUSG00000102343.Gm37381" "ENSMUSG00000025900.Rp1"
[5] "ENSMUSG00000109048.Rp1" "ENSMUSG00000025902.Sox17"
[1] "ENSMUSG00000051951.Xkr4" "ENSMUSG00000089699.Gm1992"
[3] "ENSMUSG00000102343.Gm37381" "ENSMUSG00000025900.Rp1"
[5] "ENSMUSG00000109048.Rp1" "ENSMUSG00000025902.Sox17"
DataFrame with 6 rows and 2 columns
ENSEMBL SYMBOL
<character> <character>
ENSMUSG00000051951.Xkr4 ENSMUSG00000051951 Xkr4
ENSMUSG00000089699.Gm1992 ENSMUSG00000089699 Gm1992
ENSMUSG00000102343.Gm37381 ENSMUSG00000102343 Gm37381
ENSMUSG00000025900.Rp1 ENSMUSG00000025900 Rp1
ENSMUSG00000109048.Rp1 ENSMUSG00000109048 Rp1
ENSMUSG00000025902.Sox17 ENSMUSG00000025902 Sox17
QC
# mitochondrial genes:
mito <- grep ("mt-" , rownames (sce), value = TRUE )
# get QC metrics:
sce <- addPerCellQC (sce, subsets= list (Mt= mito), percent.top= c (5 ,10 ))
head (colData (sce))
DataFrame with 6 rows and 11 columns
sample_id barcode group_id sum
<factor> <character> <factor> <numeric>
AAACCTGCAAAGCGGT-1.LC017 LC017_LPS AAACCTGCAAAGCGGT-1 LPS 1386
AAACCTGCACGGACAA-1.LC017 LC017_LPS AAACCTGCACGGACAA-1 LPS 1193
AAACCTGTCGCAAACT-1.LC017 LC017_LPS AAACCTGTCGCAAACT-1 LPS 738
AAACGGGGTATTCGTG-1.LC017 LC017_LPS AAACGGGGTATTCGTG-1 LPS 906
AAACGGGGTCGCGGTT-1.LC017 LC017_LPS AAACGGGGTCGCGGTT-1 LPS 1743
AAACGGGGTGGTTTCA-1.LC017 LC017_LPS AAACGGGGTGGTTTCA-1 LPS 1450
detected percent.top_5 percent.top_10 subsets_Mt_sum
<integer> <numeric> <numeric> <numeric>
AAACCTGCAAAGCGGT-1.LC017 935 10.24531 13.70851 22
AAACCTGCACGGACAA-1.LC017 776 17.60268 20.78793 0
AAACCTGTCGCAAACT-1.LC017 538 15.71816 18.83469 5
AAACGGGGTATTCGTG-1.LC017 683 6.62252 9.60265 29
AAACGGGGTCGCGGTT-1.LC017 1103 13.88411 16.86747 5
AAACGGGGTGGTTTCA-1.LC017 1073 9.03448 10.96552 12
subsets_Mt_detected subsets_Mt_percent total
<integer> <numeric> <numeric>
AAACCTGCAAAGCGGT-1.LC017 7 1.587302 1386
AAACCTGCACGGACAA-1.LC017 0 0.000000 1193
AAACCTGTCGCAAACT-1.LC017 4 0.677507 738
AAACGGGGTATTCGTG-1.LC017 8 3.200883 906
AAACGGGGTCGCGGTT-1.LC017 5 0.286862 1743
AAACGGGGTGGTTTCA-1.LC017 6 0.827586 1450
sce <- addPerFeatureQC (sce)
head (rowData (sce))
DataFrame with 6 rows and 4 columns
ENSEMBL SYMBOL mean detected
<character> <character> <numeric> <numeric>
ENSMUSG00000051951.Xkr4 ENSMUSG00000051951 Xkr4 1.54317939 50.012349
ENSMUSG00000089699.Gm1992 ENSMUSG00000089699 Gm1992 0.25693587 18.202025
ENSMUSG00000102343.Gm37381 ENSMUSG00000102343 Gm37381 0.00115255 0.115255
ENSMUSG00000025900.Rp1 ENSMUSG00000025900 Rp1 0.00650366 0.633901
ENSMUSG00000109048.Rp1 ENSMUSG00000109048 Rp1 0.00000000 0.000000
ENSMUSG00000025902.Sox17 ENSMUSG00000025902 Sox17 0.00773854 0.609204
# plotting some of the metrics
qc <- as.data.frame (colData (sce))
ggplot (qc, aes (subsets_Mt_percent)) + geom_histogram () + facet_wrap (~ sample_id)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (qc, aes (log10 (sum))) + geom_histogram () + facet_wrap (~ sample_id)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot (qc, aes (log10 (sum), log10 (detected))) + geom_point () + geom_density2d ()
# Set thresholds on library sizes and detection rate:
sce$ qc.out <- isOutlier (log (sce$ sum),nmads= 3 ,batch= sce$ sample_id) |
isOutlier (log (sce$ detected),nmads= 3 ,batch= sce$ sample_id)
table (sce$ qc.out)
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11349446 606.2 17705992 945.7 NA 17705992 945.7
Vcells 46164769 352.3 93715150 715.0 16384 90520856 690.7
sce <- scDblFinder (sce, samples= "sample_id" )
table (sce$ scDblFinder.class)
singlet doublet
11447 700
Normalization, filtering, reduction
# getting rid of genes that were rarely identified
sce <- sce[rowData (sce)$ detected>= 4 ,]
dim (sce)
# standard log-normalization
#sce <- logNormCounts(sce)
# variance-stabilizing transformation
vst <- suppressWarnings (sctransform:: vst (counts (sce)))
Calculating cell attributes from input UMI matrix: log_umi
Variance stabilizing transformation of count matrix of size 8887 by 12147
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 12147 cells
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
Found 115 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 8887 genes
|
| | 0%
|
|==== | 6%
|
|======== | 11%
|
|============ | 17%
|
|================ | 22%
|
|=================== | 28%
|
|======================= | 33%
|
|=========================== | 39%
|
|=============================== | 44%
|
|=================================== | 50%
|
|======================================= | 56%
|
|=========================================== | 61%
|
|=============================================== | 67%
|
|=================================================== | 72%
|
|====================================================== | 78%
|
|========================================================== | 83%
|
|============================================================== | 89%
|
|================================================================== | 94%
|
|======================================================================| 100%
Calculating gene attributes
Wall clock passed: Time difference of 2.832951 mins
logcounts (sce) <- vst$ y
# get highly-variable genes
hvg <- row.names (sce)[order (vst$ gene_attr$ residual_variance, decreasing= TRUE )[1 : 2000 ]]
# run PCA
sce <- runPCA (sce, subset_row= hvg)
# check the variance explained by the PCs:
pc.var <- attr (reducedDim (sce),"percentVar" )
plot (pc.var, xlab= "PCs" , ylab= "% variance explained" )
# restrict to the first 20 components:
reducedDim (sce) <- reducedDim (sce)[,1 : 20 ]
# run and plot 2d projections based on the PCA:
sce <- runTSNE (sce, dimred= "PCA" )
sce <- runUMAP (sce, dimred= "PCA" , n_neighbors= 30 )
# plot by doublet score
plot_grid (plotUMAP (sce, colour_by= "scDblFinder.score" ),
plotTSNE (sce, colour_by= "scDblFinder.score" ))
# filter out low quality cells
sce <- sce[,sce$ scDblFinder.class!= "doublet" & ! sce$ qc.out]
# check mixing:
plot_grid (plotTSNE (sce[, sample (1 : ncol (sce), ncol (sce))], colour_by= "group_id" ),
plotTSNE (sce[, sample (1 : ncol (sce), ncol (sce))], colour_by= "sample_id" ))
Batch correction
# batch corrected - mutual nearest neighbors
sce2 <- fastMNN (sce, batch= sce$ sample_id, BNPARAM= AnnoyParam ())
# corrected PCA
reducedDim (sce, type= "PCA" ) <- reducedDim (sce2)[,1 : 20 ]
# recompute the 2d projections
sce <- runTSNE (sce, dimred= "PCA" )
sce <- runUMAP (sce, dimred= "PCA" , n_neighbors= 25 )
plot_grid (plotTSNE (sce[, sample (1 : ncol (sce), ncol (sce))], colour_by= "group_id" ),
plotTSNE (sce[, sample (1 : ncol (sce), ncol (sce))], colour_by= "sample_id" ))
Clustering
g <- buildKNNGraph (sce, BNPARAM= AnnoyParam (), use.dimred= "PCA" , k= 30 )
sce$ cluster <- as.factor (cluster_louvain (g)$ membership)
# lower resolution
g <- buildKNNGraph (sce, BNPARAM= AnnoyParam (), use.dimred= "PCA" , k= 300 )
sce$ cluster2 <- as.factor (cluster_louvain (g)$ membership)
saveRDS (sce, file= "week13.SCE.processed.rds" )
plot_grid (plotTSNE (sce, colour_by= "cluster" , text_by= "cluster" ),
plotTSNE (sce, colour_by= "cluster2" , text_by= "cluster2" ))
Cluster abundances across samples
ca <- table (cluster= sce$ cluster, sample= sce$ sample_id)
ggplot (as.data.frame (ca), aes (sample, cluster, fill= Freq)) + geom_tile () + geom_text (aes (label= Freq))
Cluster annotation
De-novo markers identification
# identify genes that differ between clusters:
mm <- findMarkers (sce, groups= sce$ cluster, test.type= "binom" )
# select the top 5 markers by cluster:
markers <- unique (unlist (lapply (mm, FUN= function (x){
head (row.names (x[x$ FDR< 0.01 ,]),5 )
})))
Known markers
genes <- list (
astrocytes = c ("Aqp4" , "Gfap" , "Fgfr3" ,"Dio2" ),
endothelial = c ("Cldn5" ,"Nostrin" ,"Flt1" ),
microglia = c ("C1qb" ,"Tyrobp" ,"P2ry12" , "Csf1r" , "Irf8" ),
neuronal = c ("Snap25" , "Stmn2" , "Syn1" , "Rbfox3" ),
excNeuron = c ("Slc17a7" ,"Camk2a" ,"Grin2b" ,"Fezf2" ),
inhNeuron = c ("Gad1" ,"Lhx6" ,"Adarb2" ),
oligodendro = c ("Opalin" ,"Plp1" ,"Mag" ,"Mog" ),
OPC = c ("Pdgfra" ,"Sox6" ,"Bcan" )
)
# find the matching row names for each gene:
km <- lapply (genes, FUN= function (g) grep (paste0 (g, "$" , collapse= "|" ), rownames (sce), value= TRUE ))
km
$astrocytes
[1] "ENSMUSG00000054252.Fgfr3" "ENSMUSG00000020932.Gfap"
[3] "ENSMUSG00000024411.Aqp4"
$endothelial
[1] "ENSMUSG00000029648.Flt1" "ENSMUSG00000041378.Cldn5"
$microglia
[1] "ENSMUSG00000036353.P2ry12" "ENSMUSG00000036905.C1qb"
$neuronal
[1] "ENSMUSG00000027273.Snap25" "ENSMUSG00000037217.Syn1"
[3] "ENSMUSG00000027500.Stmn2" "ENSMUSG00000025576.Rbfox3"
$excNeuron
[1] "ENSMUSG00000030209.Grin2b" "ENSMUSG00000070570.Slc17a7"
[3] "ENSMUSG00000021743.Fezf2" "ENSMUSG00000024617.Camk2a"
$inhNeuron
[1] "ENSMUSG00000070880.Gad1" "ENSMUSG00000052551.Adarb2"
$oligodendro
[1] "ENSMUSG00000031425.Plp1" "ENSMUSG00000036634.Mag"
[3] "ENSMUSG00000076439.Mog" "ENSMUSG00000050121.Opalin"
$OPC
[1] "ENSMUSG00000004892.Bcan" "ENSMUSG00000029231.Pdgfra"
[3] "ENSMUSG00000051910.Sox6"
Pseudo-bulk aggregation
# mean logcounts by cluster:
pb <- aggregateData (sce, "logcounts" , by= c ("cluster" ), fun= "mean" )
assayNames (pb) <- "logcounts"
rowData (pb)$ marker4 <- NA
rowData (pb)[unlist (km),"marker4" ] <- rep (names (km),lengths (km))
sechm:: sechm (pb, unlist (km), assayName = "logcounts" , gaps_row = "marker4" , show_colnames = TRUE , do.scale = TRUE , breaks= 1 , row_title_rot= 0 )
# heatmap of the mean logcounts of the known markers:
h1 <- pheatmap (assay (pb)[unlist (km),], annotation_row= data.frame (row.names= unlist (km), type= rep (names (km), lengths (km))), split= rep (names (km), lengths (km)), cluster_rows= FALSE , scale= "row" )
# heatmap for the de-novo markers:
h2 <- pheatmap (assay (pb)[markers,], scale= "row" )
# Assign clusters to the cell type whose markers are the most expressed
# get rid of the unspecific neuronal markers:
km <- km[names (km)!= "neuronal" ]
# extract the pseudo-bulk counts of the markers for each cluster
mat <- assay (pb)[unlist (km),]
# aggregate across markers of the same type
mat <- aggregate (t (scale (t (mat))), by= list (type= rep (names (km), lengths (km))), FUN= sum)
# for each column (cluster), select the row (cell type) which has the maximum aggregated value
cl2 <- mat[,1 ][apply (mat[,- 1 ], 2 , FUN= which.max)]
# convert the cells' cluster labels to cell type labels:
sce$ cluster2 <- cl2[sce$ cluster]
# Aggregate to pseudo-bulk using the new clusters
pb <- aggregateData (sce, "logcounts" , by= c ("cluster2" ), fun= "mean" )
# Plot the expression of the markers as a sanity check
h1 <- pheatmap (assay (pb)[unlist (km),], annotation_row= data.frame (row.names= unlist (km), type= rep (names (km), lengths (km))), split= rep (names (km), lengths (km)), cluster_rows= FALSE , scale= "row" )
plotTSNE (sce, colour_by= "cluster2" , text_by= "cluster2" )
Differential state analysis
# Aggregate by cluster x sample to perform pseudo-bulk differential state analysis
sce <- muscat:: prepSCE (sce, kid= "cluster2" )
pb <- aggregateData (sce)
assays (pb)
List of length 7
names(7): astrocytes endothelial excNeuron inhNeuron microglia oligodendro OPC